In [1]:
import requests                 # to make GET requests
from bs4 import BeautifulSoup   # to parse the HTML responses
import time                     # to pause between calls
import pandas as pd             # to build the dataframes and write CSV
import os                       # to change into the data directory
import re                       # to collapse runs of newlines in post text
os.chdir('../data/')
In [2]:
# Posts dataframe
columns = ['post id', 'title', 'text', 'href', 'user id', 'mother post id']
df = pd.DataFrame(columns=columns)
# Users dataframe
columns = ['user id', 'user description']
df_users = pd.DataFrame(columns=columns)
# Initialize post index
post_id = 0
In [3]:
def add_post(post_id, title, text, url, mother_post_id, user_id, user_name):
    global df, df_users
    #
    # Add post data to the posts dataframe
    #
    newrow = {"post id": post_id,
              "title": title,
              "text": text,
              "href": url,
              "user id": user_id,
              "mother post id": mother_post_id}
    df.loc[len(df)] = newrow
    #
    # Update the users dataframe (add each user only once):
    #
    newrow = {"user id": user_id,
              "user description": user_name}
    if user_id not in df_users['user id'].values:
        df_users.loc[len(df_users)] = newrow
#url="http://ehealthforum.com/health/autism-recovery-success-story-t351300.html"
def parse_post(url):
    global df, df_users, post_id
    user_href_prefix = "http://ehealthforum.com/health/user_profile_"
    p = re.compile('\n+')  # collapse runs of blank lines
    response = requests.get(url)
    page_source = response.text
    soup = BeautifulSoup(page_source, 'html5lib')
    post_id = post_id + 1
    mother_post_id = post_id  # the thread's opening post is its own mother
    #
    # Extract data from soup:
    #
    post = soup.find("div", class_="vt_h2")
    title = post.find("h1", class_="caps").text
    #
    # Mother post:
    #
    post1 = soup.find("div", class_="vt_first_message_body")
    #
    # User who opened the thread
    #
    user_data = soup.find("span", class_="vt_asked_by_user")
    user_name = user_data.text
    user_id = user_data.find("a")['href'].replace(user_href_prefix, "").replace(".html", "")
    user_description = soup.find("span", class_="vt_user_rank").text
    user_name = user_name + " " + user_description
    #
    # Post text
    #
    text = post1.text.replace("\t", "")
    text = p.sub('\n', text)
    #
    # Add the opening post to the dataframe
    #
    add_post(post_id, title, text, url, mother_post_id, user_id, user_name)
    #
    # Follow-up replies to that message
    #
    postrows = soup.find_all("div", class_="vt_postrow_rest")
    for postrow in postrows[:-1]:  # the last postrow is not a reply, so skip it
        post_id = post_id + 1
        text = postrow.find("div", class_="vt_post_body").text
        text = text.replace("\t", "")
        text = p.sub('\n', text)
        user_id = postrow.find("a")['href'].replace(user_href_prefix, "").replace(".html", "")
        user_name = postrow.find("div", class_="vt_asked_by_user").text
        user_description = postrow.find("span", class_="vt_user_rank").text
        user_name = user_name + " " + user_description
        add_post(post_id, title, text, url, mother_post_id, user_id, user_name)
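A quick way to smoke-test the parser is to run it on the single thread URL from the comment above before crawling the full index (a hypothetical check; the page layout may have changed since this was written):
In [ ]:
# Smoke test: parse one thread and inspect the first rows
parse_post("http://ehealthforum.com/health/autism-recovery-success-story-t351300.html")
df.head()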
In [ ]:
df_users
Out[ ]:
In [ ]:
url_list = ["http://ehealthforum.com/health/autism.html",
            "http://ehealthforum.com/health/autism_medical_questions_242_0_50.html",
            "http://ehealthforum.com/health/autism_medical_questions_242_0_100.html",
            "http://ehealthforum.com/health/autism_medical_questions_242_0_150.html",
            "http://ehealthforum.com/health/autism_medical_questions_242_0_200.html"]
for url in url_list:
    response = requests.get(url)
    page_source = response.text
    soup = BeautifulSoup(page_source, 'html5lib')
    content = soup.find_all("div", class_="fp_topic_content_title")
    for topic in content:
        topic_url = topic.find("a", class_='topictitle')['href']
        print(topic_url)
        parse_post(topic_url)
        time.sleep(1)  # pause between calls so we don't hammer the server
In [ ]:
df.to_csv('ehealthforum-posts.csv',index=False)
df_users.to_csv('ehealthforum-users.csv',index=False)
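As a sanity check, the CSVs can be read back and compared against the in-memory dataframes (a minimal sketch, assuming the files were just written to the current working directory):
In [ ]:
# Reload the saved CSVs and confirm the shapes match the dataframes
posts_check = pd.read_csv('ehealthforum-posts.csv')
users_check = pd.read_csv('ehealthforum-users.csv')
print(posts_check.shape, df.shape)
print(users_check.shape, df_users.shape)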
In [ ]:
df.tail()